# Import custom packages & modules
import sys
sys.path.append("..")
# Univariate analysis module
from src.analyzer.univariate import *
# Multivariate analysis module
from src.analyzer.multivariate import *
# Data cleaning module
from src.datacleaner import *
# Data preprocessing module
from src.preprocessor import *
# Model evaluation module
from src.evaluator import *
# Ensemble-based methods visualization module
from src.modelizer.ensemble.tree_interpreter import *
# Model selection
from sklearn.model_selection import train_test_split
# Linear models
from sklearn.linear_model import ElasticNet, ElasticNetCV
# Non-linear models
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
# Ensemble-based methods
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
# Compute training time
import datetime
# Model evaluation wrappers
# Default project scorers: RMSE (negated, per sklearn convention) and R2
third_project_scorers = ['neg_root_mean_squared_error', 'r2']


def train_gridsearch(data, model, param_grid, metric=third_project_scorers, k=10, p=3, v=True):
    """Tune `model` with GridSearchCV and evaluate it on the testing set.

    Parameters
    ----------
    data : dict
        {'train': [x_train, y_train], 'test': [x_test, y_test]}.
    model : estimator
        scikit-learn compatible estimator to tune.
    param_grid : dict
        Hyperparameter grid passed to GridSearchCV.
    metric : str or list of str
        Scorer name(s); defaults to the project scorers (RMSE & R2).
    k : int
        Number of cross-validation folds.
    p : int
        Rounding precision for reported scores.
    v : bool
        Verbose flag.

    Returns
    -------
    dict
        GridSearchCV instance, best estimator, model name, testing-set scores
        and additional evaluation data (training time, feature count,
        learning potential placeholder).
    """
    # Model name
    model_label = model_name(model)
    # Get training & testing data
    x_train, y_train = data['train']
    x_test, y_test = data['test']
    # Refit condition: with multiple metrics GridSearchCV requires an explicit
    # metric to refit on (we use the first one).  With a single metric, refit
    # must be True — refit=False would make best_estimator_ unavailable and
    # crash below (this was a bug: the original used False).
    refit_cond = metric[0] if isinstance(metric, list) else True
    # Build grid search
    gridsearch = GridSearchCV(model, param_grid, cv=k, scoring=metric, refit=refit_cond)
    # Time the model training
    start_training = datetime.datetime.now()
    # Train model with grid search
    gridsearch.fit(x_train, y_train)
    end_training = datetime.datetime.now()
    # Compute training time
    training_time = end_training - start_training
    # Format training time
    training_time_str = format_run_time(training_time)
    # Best estimator, refitted on the whole training set
    trained_model = gridsearch.best_estimator_
    # Mean cross-validation scores of the *best* parameter combination.
    # (Bug fix: the original indexed [0], i.e. the first grid candidate,
    # which is not the best one in general.)
    best_idx = gridsearch.best_index_
    cv_scores = {}
    for scorer_label in (metric if isinstance(metric, list) else [metric]):
        # With a single metric sklearn stores results under 'mean_test_score'
        result_key = f'mean_test_{scorer_label}' if isinstance(metric, list) else 'mean_test_score'
        if scorer_label.startswith('neg'):
            # e.g. neg_root_mean_squared_error -> rmse; abs() drops the negation
            formatted_label = "".join([w[0] for w in scorer_label.replace('neg_', '').split('_')])
            cv_scores[formatted_label] = round(np.abs(gridsearch.cv_results_[result_key])[best_idx], p)
        else:
            cv_scores[scorer_label] = round(gridsearch.cv_results_[result_key][best_idx], p)
    # Get scores from testing set
    testing_set_scores = get_model_scores(trained_model, x_test, y_test, list(cv_scores.keys()), p, v)
    # Display cross validation mean scores
    if v:
        print_score_results(cv_scores, set_type='train')
    # Build model dictionary which contains GridSearchCV & model instances (with model name)
    model_data = {'gs': gridsearch,           # GridSearchCV trained instance
                  'model': trained_model,     # Model trained instance
                  'model_name': model_label}  # Model name
    # Build additional evaluation data dictionary
    additional_evaluation_data = {'time': training_time_str,       # Training time
                                  'n_features': x_train.shape[1],  # Selected features
                                  'learning_potential': None}      # Learning potential (filled later)
    # Build results dictionary (merge dictionaries)
    results = dict(**model_data, **testing_set_scores, **additional_evaluation_data)
    return results
# To do : refacto (cumsum is a "threshold")
def select_most_important_features(features_coefs_df,
                                   n=None,
                                   method='cumsum',
                                   model=None,
                                   thr='mean',
                                   q_value=0.5,
                                   thr_value=None,
                                   v=False):
    """
    Select the most important features of a trained model.

    Parameters
    ----------
    features_coefs_df : pandas.DataFrame
        Feature importances with 'feature' and 'coefficient' columns,
        assumed sorted by decreasing importance.
    n : int, optional
        Number of features to keep when method='cumsum'.
    method : str
        'cumsum' (keep the n first rows) or 'threshold' (keep features whose
        coefficient exceeds a threshold).
    model : estimator, optional
        Trained model, only used by the 'threshold' method.
    thr : str
        Threshold kind for method='threshold': 'q' (quantile), 'mean',
        or anything else to use `thr_value` directly.
    q_value : float
        Quantile used when thr='q'.
    thr_value : float, optional
        Arbitrary threshold used when thr is neither 'q' nor 'mean'.
    v : bool
        Verbose flag forwarded to filter_features_by_threshold.

    Returns
    -------
    tuple
        (selected features data, selected feature labels).

    Raises
    ------
    ValueError
        If `method` is neither 'cumsum' nor 'threshold'.
    """
    # N.B.: string comparisons use '==' — the original used 'is', which only
    # worked by accident of CPython small-string interning.
    if method == 'cumsum':
        # Keep the n most important features (df sorted by importance)
        n_mif_data = features_coefs_df.iloc[:n]
        n_mif_labels = n_mif_data['feature'].tolist()
    elif method == 'threshold':
        # Compute the importance threshold
        if thr == 'q':      # quantile
            mif_thr = features_coefs_df['coefficient'].quantile(q=q_value)
        elif thr == 'mean':  # mean
            mif_thr = features_coefs_df['coefficient'].mean()
        else:               # arbitrary number
            mif_thr = thr_value
        # N.B : function from mlearn preprocessor module.
        # NOTE(review): relies on notebook globals X, X_train_std, X_test_std
        # — consider passing them as parameters when refactoring.
        n_mif_data = filter_features_by_threshold(X,
                                                  X_train_std,
                                                  X_test_std,
                                                  model,
                                                  mif_thr,
                                                  verbose=v)
        n_mif_labels = n_mif_data['labels']
    else:
        raise ValueError(f"Unknown method: {method!r} (expected 'cumsum' or 'threshold')")
    # Return selected features dataframe & labels
    return n_mif_data, n_mif_labels
def run_training_cycle(features_reduced,
                       model_param_grid,
                       model=None,
                       target='energy',
                       training_wrapper='gs',
                       k=5,
                       s=third_project_scorers,
                       p=3,
                       v=True):
    """
    Run a training cycle on a reduced feature set.

    Relies on notebook globals (X_train_std_df, X_test_std_df and the four
    target vectors) to build the reduced train/test data.

    Parameters
    ----------
    features_reduced : list of str
        Feature labels to keep.
    model_param_grid : dict
        Hyperparameter grid (or ElasticNetCV parameter ranges).
    model : estimator, optional
        Estimator to tune (only used with training_wrapper='gs').
    target : str
        'energy' or 'emissions'.
    training_wrapper : str
        'en' (ElasticNetCV wrapper) or 'gs' (GridSearchCV wrapper).
    k, s, p, v :
        Folds, scorers, precision and verbosity forwarded to train_gridsearch.

    Returns
    -------
    tuple
        (results dict, reduced train/test data dict).

    Raises
    ------
    ValueError
        If `target` or `training_wrapper` is unknown.
    """
    # Filter training and testing features data (reduce the number of features)
    X_train_std_reduced = X_train_std_df.loc[:, features_reduced]
    X_test_std_reduced = X_test_std_df.loc[:, features_reduced]
    # Pick the target vectors ('==', not 'is': identity comparison of string
    # literals is an implementation detail and a bug in the original)
    if target == 'energy':
        y_train_target, y_test_target = y_train_energy, y_test_energy
    elif target == 'emissions':
        y_train_target, y_test_target = y_train_emissions, y_test_emissions
    else:
        raise ValueError(f"Unknown target: {target!r} (expected 'energy' or 'emissions')")
    # Build training & testing data dictionary for the selected target
    training_and_testing_data_reduced = {"train": [X_train_std_reduced, y_train_target],
                                         "test": [X_test_std_reduced, y_test_target]}
    # Dispatch to the requested training wrapper (ElasticNetCV or GridSearchCV)
    if training_wrapper == 'en':
        results = train_elastic_net(training_and_testing_data_reduced,
                                    model_param_grid)
    elif training_wrapper == 'gs':
        results = train_gridsearch(training_and_testing_data_reduced,
                                   model,
                                   model_param_grid,
                                   metric=s,
                                   k=k,
                                   p=p,
                                   v=v)
    else:
        raise ValueError(f"Unknown training wrapper: {training_wrapper!r} (expected 'en' or 'gs')")
    return results, training_and_testing_data_reduced
# Elastic Net Wrapper
def train_elastic_net(data, params_grid, k=10, v=True, s=('rmse', 'r2')):
    """Train an ElasticNet with built-in cross-validation and evaluate it.

    Parameters
    ----------
    data : dict
        {'train': [X_train_std, y_train], 'test': [X_test_std, y_test]}.
    params_grid : dict
        Expected to hold (alphas, l1_ratio) ranges, in that insertion order.
    k : int
        Number of cross-validation folds.
    v : bool
        Verbose flag.
    s : sequence of str
        Scorer labels (tuple default — the original used a mutable list
        default, an anti-pattern even though it was never mutated).

    Returns
    -------
    dict
        Trained model, name, testing-set scores, training time, feature
        count, best (alpha, l1_ratio) and the regularization path data.
    """
    # Get training and testing sets from data dictionary
    X_train_std, y_train = data["train"]
    X_test_std, y_test = data["test"]
    # Get (hyper)parameters from parameters grid.
    # NOTE(review): relies on dict insertion order being (alphas, l1_ratio).
    alpha_range, l1_ratio_range = params_grid.values()
    # Build model using cross validation
    elastic_net = ElasticNetCV(alphas=alpha_range,
                               cv=k,
                               l1_ratio=l1_ratio_range)
    # Time the model training
    start_training = datetime.datetime.now()
    # Train model with cross validation
    elastic_net.fit(X_train_std, y_train)
    end_training = datetime.datetime.now()
    # Compute training time
    training_time = end_training - start_training
    # Format training time
    training_time_str = format_run_time(training_time)
    # Optimal hyperparameters selected by cross-validation
    optimal_alpha = elastic_net.alpha_
    optimal_l1_ratio = elastic_net.l1_ratio_
    if v:
        print("Alpha : {} | l1_ratio : {}\n".format(optimal_alpha,
                                                    optimal_l1_ratio))
    # Regularization path (coefficients as a function of alpha)
    alphas, coefs, _ = elastic_net.path(X_train_std,
                                        y_train,
                                        alphas=alpha_range,
                                        l1_ratio=optimal_l1_ratio)
    # Format scorer labels (example : neg_root_mean_squared_error --> rmse)
    formatted_s = []
    for scorer_label in s:
        if scorer_label.startswith('neg'):
            scorer_label = "".join([w[0] for w in scorer_label.replace('neg_', '').split('_')])
        formatted_s.append(scorer_label)
    # Model evaluation on the testing set
    testing_set_scores = get_model_scores(elastic_net,
                                          X_test_std,
                                          y_test,
                                          scorer=formatted_s,
                                          verbose=v)
    # Build model dictionary which contains model data (instance & name)
    model_data = {'model': elastic_net,
                  'model_name': model_name(elastic_net)}
    # Build additional evaluation data dictionary
    additional_evaluation_data = {'time': training_time_str,           # Training time
                                  'n_features': X_train_std.shape[1],  # Selected features
                                  'learning_potential': None,          # Learning potential (filled later)
                                  'best_params': [optimal_alpha, optimal_l1_ratio],
                                  'reg_path_data': [alphas, coefs]}
    # Build results dictionary (merge dictionaries)
    results = dict(**model_data, **testing_set_scores, **additional_evaluation_data)
    return results
# --- Data loading & preparation --------------------------------------------
# Import the dataset (Seattle buildings, ENERGYSTARScore column removed)
df_raw = pd.read_csv('../data/csv/seattle_model_data_no_ENERGYSTARScore.csv')
df = df_raw.copy()
print(df.shape)
df.head(2)
# Targets
targets_cols = ['SiteEnergyUse(kBtu)', 'TotalGHGEmissions']
# Features
features_cols = [col for col in df.columns if col not in targets_cols]
# Features data
df_without_targets = df[features_cols]
# N.B : functions from mlearn preprocessor module
# Method :
# null_variance_cols = features_with_null_variances(df, verbose=True)
# identical_variance_cols = features_with_identical_variances(df, col_kept='last', verbose=True)
# invalid_variances_cols = null_variance_cols + identical_variance_cols
# df = df[[col for col in df.columns if col not in invalid_variances_cols]]
# Remove invalid (null / identical variance) features from dataframe
df = filter_invalid_variances(df, feature_kept='last', v=True)
# N.B : functions from mlearn preprocessor module
# NOTE(review): df_without_targets was built BEFORE the variance filtering
# above, so the correlation filter below runs on the unfiltered columns —
# confirm this ordering is intentional.
# Get filtered features dataframe (drop highly correlated features)
features = filter_correlated_features(df_without_targets, threshold=0.5, verbose=True)
# features = df_without_targets
features.shape
# Remove energy-type variables (avoid leaking energy information into features)
delete_cols(features, ['Main_energy_electricity', 'Main_energy_steam'])
# Features data
X = features
# Extract features labels
training_features = X.columns.tolist()
# Targets data
y = df[targets_cols]
# Training & testing sets split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Split targets (energy consumption & gas emissions)
y_train_energy, y_train_emissions = [y_train[target] for target in targets_cols]
y_test_energy, y_test_emissions = [y_test[target] for target in targets_cols]
# Standardize data (center & reduce)
# N.B : function from mlearn preprocessor module
X_train_std, std_scaler = standard_scaler(X_train, return_std_scaler=True)
# Reuse the scaler fitted on the training set only (no test-set leakage)
X_test_std = std_scaler.transform(X_test)
# Build training & testing data dictionary for each target
energy_data = {"train": [X_train_std, y_train_energy],
               "test": [X_test_std, y_test_energy]}
emissions_data = {"train": [X_train_std, y_train_emissions],
                  "test": [X_test_std, y_test_emissions]}
# Build dataframes from standardized training & testing data
# => makes it easier to filter the features that contribute most to the model
X_train_std_df = pd.DataFrame(X_train_std, columns=training_features)
X_test_std_df = pd.DataFrame(X_test_std, columns=training_features)
# Baseline: dummy regressor predicting the training mean of each target
y_pred_dum_energy, y_test_dum_energy = dummy_regression(X_train_std,
                                                        y_train_energy,
                                                        X_test_std,
                                                        y_test_energy,
                                                        strategy='mean')
print('Dummy regression : RMSE = {}'.format(mean_squared_error(y_test_energy,
                                                               y_pred_dum_energy,
                                                               squared=False)))
print('Dummy regression : R2 = {}'.format(r2_score(y_test_energy, y_pred_dum_energy)))
y_pred_dum_emissions, y_test_dum_emissions = dummy_regression(X_train_std,
                                                              y_train_emissions,
                                                              X_test_std,
                                                              y_test_emissions,
                                                              strategy='mean')
print('Dummy regression : RMSE = {}'.format(mean_squared_error(y_test_emissions,
                                                               y_pred_dum_emissions,
                                                               squared=False)))
print('Dummy regression : R2 = {}'.format(r2_score(y_test_emissions, y_pred_dum_emissions)))
# For each target we carry out the following training cycle:
# --- Elastic Net: energy target ---------------------------------------------
# alphas range candidates : np.logspace(-3, 3, 10), np.arange(0.001, 0.009, 0.001)
enet_en_params_grid = {"alphas": np.logspace(-3, 3, 10),
                       "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}
# First training cycle on the full feature set
enet_en_data = train_elastic_net(energy_data, enet_en_params_grid)
en_alphas, en_coefs = enet_en_data["reg_path_data"]
# N.B : function from mlearn evaluator module
plot_regularization_path(en_alphas, en_coefs.T, training_features, n_features_labels=10)
# N.B : Elastic Net reduces the coefficients of irrelevant features to 0.
# We have therefore selected here features with positive or negative coefficients
elastic_net_en_coefs = enet_en_data["model"].coef_
elastic_net_en_features_coefs_df = get_features_importance(training_features,
                                                           elastic_net_en_coefs,
                                                           abs_coefs=True,
                                                           non_zero_coefs=True,
                                                           verbose=True)
# N.B : function from mlearn preprocessor module
plot_cumulative_features_importance(elastic_net_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Select the 61 most important features (cumulative-importance cut)
enet_en_mif_61_data = elastic_net_en_features_coefs_df.iloc[:61]
enet_en_mif_61_labels = enet_en_mif_61_data['feature'].tolist()
# Visualize n most important features
# N.B : function from mlearn preprocessor module
plot_n_top_features(enet_en_mif_61_data,
                    model_name(enet_en_data["model"]),
                    n=10,
                    plot_size=(12, 4))
# Second training cycle with features reduced (173 -> 61)
X_train_std_en_61f = X_train_std_df.loc[:, enet_en_mif_61_labels]
X_test_std_en_61f = X_test_std_df.loc[:, enet_en_mif_61_labels]
# Build training & testing data dictionary
en_data_reduced_61f = {"train": [X_train_std_en_61f, y_train_energy],
                       "test": [X_test_std_en_61f, y_test_energy]}
enet_en_reduced_data_61f = train_elastic_net(en_data_reduced_61f, enet_en_params_grid)
# Alternative selection: keep features whose coefficient exceeds Q3.
# We use the same model (from first training cycle)
mif_q3 = elastic_net_en_features_coefs_df['coefficient'].quantile(q=0.75)
ffd = filter_features_by_threshold(X, X_train_std, X_test_std, enet_en_data['model'], mif_q3)
print('Features selected : {}'.format(len(ffd['labels'])))
enet_en_mif_31_labels = ffd['labels']
# Second training cycle with features reduced (173 -> 31)
X_train_std_en_31f = X_train_std_df.loc[:, enet_en_mif_31_labels]
X_test_std_en_31f = X_test_std_df.loc[:, enet_en_mif_31_labels]
# Build training & testing data dictionary
en_data_reduced_31f = {"train": [X_train_std_en_31f, y_train_energy],
                       "test": [X_test_std_en_31f, y_test_energy]}
enet_en_reduced_data_31f = train_elastic_net(en_data_reduced_31f, enet_en_params_grid)
# Cumulative feature importance selection method seems to be slightly better
# (fewer variables selected and a slightly better r2)
# Plot training curve (best model selected by a trade-off between total features & score (RMSE, R2))
# N.B : function from mlearn evaluator module
plot_validation_curve(ElasticNet(alpha=0.001, l1_ratio=0.2),
                      X_train_std_en_31f,
                      y_train_energy,
                      'alpha',
                      np.logspace(-3, 3, 10),
                      log_scale=True,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve (best model selected by a trade-off between total features & score (RMSE, R2))
# N.B : function from mlearn evaluator module
plot_learning_curve(ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=2000),
                    X_train_std_en_31f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : No
# Model performance seems to stagnate from around 4 000 observations
enet_en_reduced_data_31f['learning_potential'] = 'No'
# N.B.: from here on we use the wrapper functions defined above in order to
# run the training / feature-selection cycles more concisely.
# --- Elastic Net: emissions target ------------------------------------------
# Train model
enet_em_params_grid = {"alphas": np.logspace(-3, 3, 10),
                       "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}
enet_em_data = train_elastic_net(emissions_data, enet_em_params_grid)
em_alphas, em_coefs = enet_em_data["reg_path_data"]
plot_regularization_path(em_alphas, em_coefs.T, training_features, n_features_labels=10)
elastic_net_em_coefs = enet_em_data["model"].coef_
elastic_net_em_features_coefs_df = get_features_importance(training_features,
                                                           elastic_net_em_coefs,
                                                           abs_coefs=True,
                                                           non_zero_coefs=True,
                                                           verbose=True)
plot_cumulative_features_importance(elastic_net_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Keep the 70 most important features (cumulative-importance method)
enet_em_mif_70_data, enet_em_mif_70_labels = select_most_important_features(elastic_net_em_features_coefs_df,
                                                                            n=70,
                                                                            method='cumsum')
# Visualize n most important features
plot_n_top_features(enet_em_mif_70_data,
                    model_name(enet_em_data["model"]),
                    n=10,
                    plot_size=(12, 4))
# Second training cycle with features reduced (173 -> 70)
# en_params_grid = {"alphas": np.arange(0.0020, 0.0030, 0.0001),
#                   "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}
enet_em_reduced_data_70f, train_and_test_data_reduced_70f = run_training_cycle(enet_em_mif_70_labels,
                                                                               enet_em_params_grid,
                                                                               training_wrapper='en')
# Alternative selection: threshold method (coefficients above Q3)
enet_em_mif_n_data, enet_em_mif_n_labels = select_most_important_features(elastic_net_em_features_coefs_df,
                                                                          method='threshold',
                                                                          model=enet_em_data["model"],
                                                                          thr='q',
                                                                          q_value=0.75,  # Q3
                                                                          v=True)
# Second training cycle with features reduced (173 -> 36)
enet_em_reduced_data_36f, train_and_test_data_reduced_36f = run_training_cycle(enet_em_mif_n_labels,
                                                                               enet_em_params_grid,
                                                                               training_wrapper='en')
# Plot training curve
x_train_std_em_36f = train_and_test_data_reduced_36f['train'][0]
plot_validation_curve(ElasticNet(alpha=0.001, l1_ratio=0.2),
                      x_train_std_em_36f,
                      y_train_emissions,
                      'alpha',
                      np.logspace(-3, 3, 10),
                      log_scale=True,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=2000),
                    x_train_std_em_36f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : No
# The curve stagnates around 3500 observations
enet_em_reduced_data_36f['learning_potential'] = 'No'
%%time
# --- Random Forest: energy target -------------------------------------------
rfr_model = RandomForestRegressor(random_state=42)
# Grids explored previously:
# n_estimators range | max_depth range
# np.arange(200, 550, 50) | [2, 4, 6, 8, 10]
# [400, 500, 600]
rfr_en_params = {'n_estimators': [100],  # 250
                 # 'max_depth': [150],
                 # 'max_features': ['auto', 'sqrt', 'log2']
                 'min_samples_leaf': [1]}
rfr_en_data = train_gridsearch(energy_data,
                               rfr_model,
                               rfr_en_params,
                               k=10)
# print(rfr_en_data['model'])
rfr_en_model = rfr_en_data['model']
# Tree-based feature importances
rfr_en_coefs = rfr_en_model.feature_importances_
rfr_en_features_coefs_df = get_features_importance(training_features,
                                                   rfr_en_coefs,
                                                   abs_coefs=True,
                                                   non_zero_coefs=False,
                                                   verbose=False)
plot_cumulative_features_importance(rfr_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Keep the 12 most important features (cumulative-importance method)
rfr_en_mif_12_data, rfr_en_mif_12_labels = select_most_important_features(rfr_en_features_coefs_df,
                                                                          n=12,
                                                                          method='cumsum')
# Visualize n most important features
plot_n_top_features(rfr_en_mif_12_data,
                    model_name(rfr_en_model),
                    n=10,
                    plot_size=(12, 4))
# Second training cycle with features reduced (173 -> 12)
rfr_en_model_c2 = RandomForestRegressor(random_state=42)
rfr_en_reduced_data_12f, train_and_test_data_reduced_12f = run_training_cycle(rfr_en_mif_12_labels,
                                                                              rfr_en_params,
                                                                              rfr_en_model_c2)
# Display a naive (shallow) decision tree for interpretability
rfr_en_naiv_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=3)
rfr_en_naiv_model.fit(*train_and_test_data_reduced_12f['train'])
display_decision_tree(rfr_en_naiv_model,
                      rfr_en_mif_12_labels,
                      targets_cols[0],
                      tree_nb=5)
# Alternative selection: threshold method (importances above Q3)
rfr_en_mif_n_data, rfr_en_mif_n_labels = select_most_important_features(rfr_en_features_coefs_df,
                                                                        method='threshold',
                                                                        model=rfr_en_model,
                                                                        thr='q',
                                                                        q_value=0.75,  # Q3
                                                                        v=True)
# Second training cycle with features reduced (173 -> 43)
rfr_en_model_c2 = RandomForestRegressor(random_state=42, n_jobs=-1)
rfr_en_reduced_data_43f, train_and_test_data_reduced_43f = run_training_cycle(rfr_en_mif_n_labels,
                                                                              rfr_en_params,
                                                                              rfr_en_model_c2)
# Display a naive (shallow) decision tree
rfr_en_naiv_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=3)
rfr_en_naiv_model.fit(*train_and_test_data_reduced_43f['train'])
display_decision_tree(rfr_en_naiv_model,
                      rfr_en_mif_n_labels,
                      targets_cols[0],
                      tree_nb=5)
# Plot training curve
x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]
plot_validation_curve(RandomForestRegressor(n_estimators=100, random_state=42),
                      x_train_std_en_12f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(50, 500, 50),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
                    x_train_std_en_12f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
rfr_en_reduced_data_12f['learning_potential'] = 'Yes'
%%time
# Random Forest
rfr_model = RandomForestRegressor(random_state=42) #,n_jobs=-1)
# n_estimators range | max_depth range
# np.arange(200, 550, 50) | [2, 4, 6, 8, 10]
# [400, 500, 600]
rfr_em_params = {'n_estimators': [350], #
# 'max_depth': [40],
# 'max_features': ['auto', 'sqrt', 'log2']
'min_samples_leaf': [1]}
rfr_em_data = train_gridsearch(emissions_data,
rfr_model,
rfr_em_params,
k=10)
print(rfr_em_data['model'])
"""
- RMSE = 0.726 722
- R2 = 0.762
"""
rfr_em_model = rfr_em_data['model']
rfr_em_coefs = rfr_em_model.feature_importances_
rfr_em_features_coefs_df = get_features_importance(training_features,
rfr_em_coefs,
abs_coefs=True,
non_zero_coefs=False,
verbose=False)
plot_cumulative_features_importance(rfr_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
rfr_em_mif_18_data, rfr_em_mif_18_labels = select_most_important_features(rfr_em_features_coefs_df,
n=18,
method='cumsum')
# Visualize n most important features
plot_n_top_features(rfr_em_mif_18_data,
model_name(rfr_em_model),
n=10,
plot_size=(12, 4))
# Second training cycle with features reduced (173 -> 18)
rfr_em_model_c2 = RandomForestRegressor(random_state=42, n_jobs=-1)
rfr_em_reduced_data_18f, train_and_test_data_reduced_18f = run_training_cycle(rfr_em_mif_18_labels,
rfr_em_params,
rfr_em_model_c2)
# Display naiv decision tree
rfr_em_naiv_model = RandomForestRegressor(n_estimators=350, random_state=42, max_depth=3)
rfr_em_naiv_model.fit(*train_and_test_data_reduced_18f['train'])
display_decision_tree(rfr_em_naiv_model,
rfr_em_mif_18_labels,
targets_cols[1],
tree_nb=5)
rfr_em_mif_n_data, rfr_em_mif_n_labels = select_most_important_features(rfr_em_features_coefs_df,
method='threshold',
model=rfr_em_model,
thr='q',
q_value=0.75, # Q3
v=True)
# Second training cycle with features reduced (173 -> 43)
rfr_em_model_c2 = RandomForestRegressor(random_state=42)
rfr_em_reduced_data_43f, train_and_test_data_reduced_43f = run_training_cycle(rfr_em_mif_n_labels,
rfr_em_params,
rfr_em_model_c2)
# Display naiv decision tree
rfr_em_naiv_model = RandomForestRegressor(n_estimators=350, random_state=42, max_depth=3)
rfr_em_naiv_model.fit(*train_and_test_data_reduced_43f['train'])
display_decision_tree(rfr_em_naiv_model,
rfr_em_mif_n_labels,
targets_cols[1],
tree_nb=5)
# Plot training curve
x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]
plot_validation_curve(RandomForestRegressor(n_estimators=350, random_state=42, n_jobs=-1),
x_train_std_em_18f,
y_train_energy,
'n_estimators',
np.arange(200, 800, 200),
log_scale=False,
scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(RandomForestRegressor(n_estimators=350, random_state=42, n_jobs=-1),
x_train_std_em_18f,
y_train_energy,
train_sizes_ratio=np.linspace(0.1, 1, 10),
scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
rfr_em_reduced_data_18f['learning_potential'] = 'Yes'
%%time
# --- XGBoost: energy target -------------------------------------------------
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')
# Earlier grid explored:
# xgb_reg_params = {'n_estimators': [150, 200, 250],
#                   'alpha': [0.5, 2, 5],
#                   'learning_rate': np.arange(0.1, 1.1, 0.1),
#                   'max_depth': [5, 10, 20],
#                   'colsample_bytree': [0.3],
#                   # 'subsample': np.arange(0.1, 1.1, 0.1)
#                   }
xgb_reg_en_params = {'n_estimators': [500],
                     'alpha': [0.5],            # selected from np.arange(1, 2, 0.1)
                     'learning_rate': [0.062],  # 0.065
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }
xgb_en_data = train_gridsearch(energy_data,
                               xgb_reg,
                               xgb_reg_en_params)
print(xgb_en_data['model'])
xgb_reg_en_model = xgb_en_data['model']
# Tree-based feature importances
xgb_reg_en_coefs = xgb_reg_en_model.feature_importances_
xgb_reg_en_features_coefs_df = get_features_importance(training_features,
                                                       xgb_reg_en_coefs,
                                                       abs_coefs=True,
                                                       non_zero_coefs=False,
                                                       verbose=False)
plot_cumulative_features_importance(xgb_reg_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Keep the 72 most important features (cumulative-importance method)
xgb_reg_en_mif_72_data, xgb_reg_en_mif_72_labels = select_most_important_features(xgb_reg_en_features_coefs_df,
                                                                                  n=72,
                                                                                  method='cumsum')
# Visualize n most important features
plot_n_top_features(xgb_reg_en_mif_72_data,
                    model_name(xgb_reg_en_model),
                    n=10,
                    plot_size=(12, 4))
%%time
# Second training cycle with features reduced (173 -> 72)
xgb_reg_en_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_en_reduced_data_72f, train_and_test_data_reduced_72f = run_training_cycle(xgb_reg_en_mif_72_labels,
                                                                                  xgb_reg_en_params,
                                                                                  xgb_reg_en_model_c2)
# Display a naive decision tree
xgb.plot_tree(xgb_reg_en_reduced_data_72f['model'], num_trees=5)
plt.show()
# Alternative selection: threshold method (importances above Q3)
xgb_reg_en_mif_n_data, xgb_reg_en_mif_n_labels = select_most_important_features(xgb_reg_en_features_coefs_df,
                                                                                method='threshold',
                                                                                model=xgb_reg_en_model,
                                                                                thr='q',
                                                                                q_value=0.75,  # Q3
                                                                                v=True)
# Second training cycle with features reduced (173 -> 43)
xgb_reg_en_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_en_reduced_data_43f, train_and_test_data_reduced_43f = run_training_cycle(xgb_reg_en_mif_n_labels,
                                                                                  xgb_reg_en_params,
                                                                                  xgb_reg_en_model_c2)
# Display a naive decision tree
xgb.plot_tree(xgb_reg_en_reduced_data_43f['model'], num_trees=5)
plt.show()
# Plot training curve
x_train_std_en_43f = train_and_test_data_reduced_43f['train'][0]
# NOTE: xgb_reg_en_params is reassigned here from a grid dict (lists) to a
# plain kwargs dict for direct XGBRegressor construction.
xgb_reg_en_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 0.5,
                     'learning_rate': 0.062,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}
plot_validation_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                      x_train_std_en_43f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 800, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                    x_train_std_en_43f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
xgb_reg_en_reduced_data_43f['learning_potential'] = 'Yes'
%%time
# --- XGBoost: emissions target ----------------------------------------------
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')
# Earlier grid explored:
# xgb_reg_params = {'n_estimators': [150, 200, 250, 300],
#                   'alpha': [2, 5],
#                   'learning_rate': np.arange(0.1, 1.1, 0.1),
#                   'max_depth': [5, 10, 20],
#                   'colsample_bytree': [0.3],
#                   'subsample': np.arange(0.1, 1.1, 0.1)}
xgb_reg_em_params = {'n_estimators': [750],
                     'alpha': [1.6],  # selected from np.arange(1, 2, 0.1)
                     'learning_rate': [0.05],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }
xgb_em_data = train_gridsearch(emissions_data,
                               xgb_reg,
                               xgb_reg_em_params)
print(xgb_em_data['model'])
xgb_reg_em_model = xgb_em_data['model']
# Tree-based feature importances
xgb_reg_em_coefs = xgb_reg_em_model.feature_importances_
xgb_reg_em_features_coefs_df = get_features_importance(training_features,
                                                       xgb_reg_em_coefs,
                                                       abs_coefs=True,
                                                       non_zero_coefs=False,
                                                       verbose=False)
plot_cumulative_features_importance(xgb_reg_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
# Keep the 79 most important features (cumulative-importance method)
xgb_reg_em_mif_79_data, xgb_reg_em_mif_79_labels = select_most_important_features(xgb_reg_em_features_coefs_df,
                                                                                  n=79,
                                                                                  method='cumsum')
# Visualize n most important features
plot_n_top_features(xgb_reg_em_mif_79_data,
                    model_name(xgb_reg_em_model),
                    n=10,
                    plot_size=(12, 4))
%%time
# Second training cycle with features reduced (173 -> 79)
xgb_reg_em_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_em_reduced_data_79f, train_and_test_data_reduced_79f = run_training_cycle(xgb_reg_em_mif_79_labels,
                                                                                  xgb_reg_em_params,
                                                                                  xgb_reg_em_model_c2)
# Display a naive decision tree
xgb.plot_tree(xgb_reg_em_reduced_data_79f['model'], num_trees=5)
plt.show()
# Alternative selection: threshold method (importances above Q3)
xgb_reg_em_mif_n_data, xgb_reg_em_mif_n_labels = select_most_important_features(xgb_reg_em_features_coefs_df,
                                                                                method='threshold',
                                                                                model=xgb_reg_em_model,
                                                                                thr='q',
                                                                                q_value=0.75,  # Q3
                                                                                v=True)
# Second training cycle with features reduced (173 -> 43)
xgb_reg_em_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_em_reduced_data_43f, train_and_test_data_reduced_43f = run_training_cycle(xgb_reg_em_mif_n_labels,
                                                                                  xgb_reg_em_params,
                                                                                  xgb_reg_em_model_c2)
# Display a naive decision tree
xgb.plot_tree(xgb_reg_em_reduced_data_43f['model'], num_trees=5)
plt.show()
# Plot training curve
x_train_std_em_43f = train_and_test_data_reduced_43f['train'][0]
# NOTE: xgb_reg_em_params is reassigned here from a grid dict (lists) to a
# plain kwargs dict for direct XGBRegressor construction.
xgb_reg_em_params = {'objective': 'reg:squarederror',
                     'n_estimators': 750,
                     'alpha': 1.6,
                     'learning_rate': 0.05,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}
plot_validation_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                      x_train_std_em_43f,
                      y_train_emissions,
                      'n_estimators',
                      np.arange(200, 800, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                    x_train_std_em_43f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
xgb_reg_em_reduced_data_43f['learning_potential'] = 'Yes'
# Get union from best models selected features for each target
# Energy best selected features
# Merge best selected features from models
en_total_best_features = enet_en_mif_31_labels + rfr_en_mif_12_labels + xgb_reg_en_mif_n_labels
# Extract best unique selected features from merged list
# (set() deduplicates but does not preserve order)
en_best_features = list(set(en_total_best_features))
# Emissions best selected features
# Merge best selected features from models
em_total_best_features = enet_em_mif_n_labels + rfr_em_mif_18_labels + xgb_reg_em_mif_n_labels
# Extract best unique selected features from merged list
em_best_features = list(set(em_total_best_features))
# Display best features count for each target
print('Energy target has {} best features'.format(len(en_best_features)))
print('Emissions target has {} best features'.format(len(em_best_features)))
# Non-linear models do not offer a simple way to measure feature
# contributions. We therefore proceed differently here, by evaluating each
# candidate feature set (per model, plus their union) on the kernel SVR.
%%time
# --- Kernel SVR: energy target ----------------------------------------------
kernel_svr_model = SVR(kernel='rbf')
# C candidates explored: 300, np.arange(50, 550, 50), np.logspace(-3, 3, 10)
kernel_svr_en_params = {'C': [250],
                        'gamma': [0.001],
                        'epsilon': [0.1]}
kernel_svr_en_data = train_gridsearch(energy_data,
                                      kernel_svr_model,
                                      kernel_svr_en_params)
"""
- RMSE = 0.538
- R2 = 0.772
"""
kernel_svr_en_model = kernel_svr_en_data['model']
kernel_svr_en_model
%%time
# Second training cycle with features reduced
kernel_svr_en_model = SVR(kernel='rbf')
# We test the different sets of selected features for each model by also adding the union of these
en_selected_feature_sets_dict = {'ElasticNet': enet_en_mif_31_labels,
                                 'RandomForestRegressor': rfr_en_mif_12_labels,
                                 'XGBoostRegressor': xgb_reg_en_mif_n_labels,
                                 'Union of features selected by each model': en_best_features}
def run_training_cycle_for_each_feature_set(feature_sets, model, model_param_grid):
    """Run one reduced-feature training cycle per candidate feature set.

    Parameters:
        feature_sets (dict): maps a selector label (the model that produced
            the selection) to its list of selected feature labels.
        model: estimator instance to train on each feature subset.
        model_param_grid (dict): grid-search parameter grid for the model.

    Returns:
        pd.DataFrame: one row per feature set with the cycle's RMSE, R2,
        run time, feature count and the selector label.
    """
    # Result columns, in the order expected by the comparison tables
    columns = ('rmse', 'r2', 'time', 'total selected features',
               'selected features by')
    cycle_results = {column: [] for column in columns}
    # One full training cycle (non-verbose) per candidate feature list
    for selector_label, feature_labels in feature_sets.items():
        cycle_data, _ = run_training_cycle(feature_labels,
                                           model_param_grid,
                                           model,
                                           v=False)
        row = (cycle_data['rmse'],
               cycle_data['r2'],
               cycle_data['time'],
               len(feature_labels),
               selector_label)
        for column, value in zip(columns, row):
            cycle_results[column].append(value)
    # Assemble the per-feature-set comparison table
    return pd.DataFrame(cycle_results)
# Run the second training cycle (Kernel SVR, energy target) once per
# candidate feature set and collect the comparison table.
kernel_svr_en_second_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                                kernel_svr_en_model,
                                                                                kernel_svr_en_params)
# Best performances for each feature set
# Display the comparison table (notebook cell output)
kernel_svr_en_second_cycle_results_df
# Train best Kernel SVR model with appropriate selected features list
# (the 12 features selected by the random forest performed best above).
kernel_svr_en_reduced_data_12f, train_and_test_data_reduced_12f = run_training_cycle(rfr_en_mif_12_labels,
                                                                                     kernel_svr_en_params,
                                                                                     kernel_svr_en_model,
                                                                                     v=False)
# Standardized training features restricted to the 12 selected columns
x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]
# NOTE: this rebinding switches kernel_svr_en_params from grid format
# (lists of candidates) to plain estimator kwargs, for SVR(**params) below.
kernel_svr_en_params = {'kernel': 'rbf',
                        'C': 250,
                        'gamma': 0.001,
                        'epsilon': 0.1}
# Validation curve over C to check whether 250 is a good operating point
plot_validation_curve(SVR(**kernel_svr_en_params),
                      x_train_std_en_12f,
                      y_train_energy,
                      'C',
                      np.arange(50, 550, 150),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Fix C hyperparameter to 200
# Train best Kernel SVR model with appropriate selected features list (and C = 200)
# Grid format again (single-value lists) for the training-cycle wrapper.
kernel_svr_en_params = {'C': [200],
                        'gamma': [0.001],
                        'epsilon': [0.1]}
kernel_svr_en_reduced_data_12f, train_and_test_data_reduced_12f = run_training_cycle(rfr_en_mif_12_labels,
                                                                                     kernel_svr_en_params,
                                                                                     kernel_svr_en_model,
                                                                                     v=False)
# Plot learning curve
x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]
# Plain estimator kwargs (not a grid) for SVR(**params) below
kernel_svr_en_params = {'kernel': 'rbf',
                        'C': 200,
                        'gamma': 0.001,
                        'epsilon': 0.1}
# N.B : function from mlearn evaluator module
plot_learning_curve(SVR(**kernel_svr_en_params),
                    x_train_std_en_12f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
# NOTE(review): the comment above says 'Yes' but the value stored below is
# 'No' — one of the two is wrong; confirm against the learning-curve plot.
kernel_svr_en_reduced_data_12f['learning_potential'] = 'No'
%%time
# First training cycle: Kernel SVR (RBF) on the full emissions feature set.
kernel_svr_model = SVR(kernel='rbf')
kernel_svr_em_params = {'C': [250], # 300
                        'gamma': [0.001], # 0.01
                        'epsilon': [0.1]}
# Grid-search training via the project wrapper (emissions target)
kernel_svr_em_data = train_gridsearch(emissions_data,
                                      kernel_svr_model,
                                      kernel_svr_em_params)
"""
- RMSE = 0.933
- R2 = 0.607
"""
# Display the fitted estimator (notebook cell output)
kernel_svr_em_data['model']
%%time
# Second training cycle with features reduced
kernel_svr_em_model = SVR(kernel='rbf')
# We test the different sets of selected features for each model by also adding the union of these
# Each entry maps the selecting model to its list of feature labels.
em_selected_feature_sets_dict = {'ElasticNet': enet_em_mif_n_labels,
                                 'RandomForestRegressor': rfr_em_mif_18_labels,
                                 'XGBoostRegressor': xgb_reg_em_mif_n_labels,
                                 'Union of features selected by each model': em_best_features}
# One second-cycle run per candidate feature set (emissions target)
kernel_svr_em_second_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                                kernel_svr_em_model,
                                                                                kernel_svr_em_params)
# Best performances for each feature set
# Display the comparison table (notebook cell output)
kernel_svr_em_second_cycle_results_df
# Retrain Kernel SVR on the 18 features selected by the random forest
# (best-performing feature set in the table above).
kernel_svr_em_reduced_data_18f, train_and_test_data_reduced_18f = run_training_cycle(rfr_em_mif_18_labels,
                                                                                     kernel_svr_em_params,
                                                                                     kernel_svr_em_model,
                                                                                     v=False)
# Standardized training features restricted to the 18 selected columns
x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]
# Rebinding: grid format -> plain SVR kwargs for the curve plots below
kernel_svr_em_params = {'kernel': 'rbf',
                        'C': 250,
                        'gamma': 0.001,
                        'epsilon': 0.1}
# Validation curve over C
plot_validation_curve(SVR(**kernel_svr_em_params),
                      x_train_std_em_18f,
                      y_train_emissions,
                      'C',
                      np.arange(50, 550, 150),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
# Plain estimator kwargs for SVR(**params) below
kernel_svr_em_params = {'kernel': 'rbf',
                        'C': 250,
                        'gamma': 0.001,
                        'epsilon': 0.1}
# N.B : function from mlearn evaluator module
plot_learning_curve(SVR(**kernel_svr_em_params),
                    x_train_std_em_18f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : No
# Store the learning-curve verdict in the model-data dictionary
kernel_svr_em_reduced_data_18f['learning_potential'] = 'No'
%%time
# Multi-Layer Perceptron Regressor
mlpr_model = MLPRegressor(activation='relu', #'identity',
learning_rate='adaptive',
alpha=4,
max_iter=5000,
verbose=False)
mlpr_en_params = {'hidden_layer_sizes': (16, 3), # pow(16, 3) < df.shape[0]
'learning_rate_init': [0.0008]} # 0.001
mlpr_en_data = train_gridsearch(energy_data,
mlpr_model,
mlpr_en_params)
mlpr_en_data['model']
%%time
# Second training cycle with features reduced
mlpr_en_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False)
# One second-cycle run per candidate feature set (energy target)
mlpr_en_second_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                          mlpr_en_model,
                                                                          mlpr_en_params)
# Best performances for each feature set
# Display the comparison table (notebook cell output)
mlpr_en_second_cycle_results_df
# Retrain the MLP on the 12 features selected by the random forest
mlpr_en_reduced_data_12f, train_and_test_data_reduced_12f = run_training_cycle(rfr_en_mif_12_labels,
                                                                               mlpr_en_params,
                                                                               mlpr_en_model,
                                                                               v=False)
# Standardized training features restricted to the 12 selected columns
x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]
# Rebinding: plain MLPRegressor kwargs (not a grid) for the curve plot
mlpr_en_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 5000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0008}
# Validation curve over the initial learning rate
plot_validation_curve(MLPRegressor(**mlpr_en_params),
                      x_train_std_en_12f,
                      y_train_energy,
                      'learning_rate_init',
                      np.arange(0.0001, 0.001, 0.0001),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Run another training cycle with a learning rate = 0.0001
# FIX: wrap the layer-size tuple in a list — GridSearchCV treats each
# param_grid value as a sequence of candidates, so a bare (16, 3) would be
# read as two candidates (16 and 3) rather than one (16, 3) architecture.
mlpr_en_params = {'hidden_layer_sizes': [(16, 3)], # pow(16, 3) < df.shape[0]
                  'learning_rate_init': [0.0001]} # 0.001
mlpr_en_reduced_data_12f, train_and_test_data_reduced_12f = run_training_cycle(rfr_en_mif_12_labels,
                                                                               mlpr_en_params,
                                                                               mlpr_en_model,
                                                                               v=True)
# Plot learning curve
x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]
# Plain estimator kwargs for MLPRegressor(**params) below
mlpr_en_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  # NOTE(review): 50000 here vs 5000 everywhere else in this
                  # file — possibly a typo; confirm the intended budget.
                  'max_iter': 50000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0001}
# N.B : function from mlearn evaluator module
plot_learning_curve(MLPRegressor(**mlpr_en_params),
                    x_train_std_en_12f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : No
mlpr_en_reduced_data_12f['learning_potential'] = 'No'
%%time
mlpr_model = MLPRegressor(activation='relu', #'identity',
learning_rate='adaptive',
alpha=4,
max_iter=5000,
verbose=False)
mlpr_em_params = {'hidden_layer_sizes': (16, 3), # pow(16, 3) < df.shape[0]
'learning_rate_init': [0.00075]} # 0.001
mlpr_em_data = train_gridsearch(emissions_data,
mlpr_model,
mlpr_em_params)
mlpr_em_data['model']
%%time
# Second training cycle with features reduced
mlpr_em_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False)
# One second-cycle run per candidate feature set (emissions target)
mlpr_em_second_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                          mlpr_em_model,
                                                                          mlpr_em_params)
# Best performances for each feature set
# Display the comparison table (notebook cell output)
mlpr_em_second_cycle_results_df
# Retrain the MLP on the 18 features selected by the random forest
mlpr_em_reduced_data_18f, train_and_test_data_reduced_18f = run_training_cycle(rfr_em_mif_18_labels,
                                                                               mlpr_em_params,
                                                                               mlpr_em_model,
                                                                               v=False)
# Standardized training features restricted to the 18 selected columns
x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]
# Rebinding: plain MLPRegressor kwargs (not a grid) for the curve plot
mlpr_em_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 5000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.00075}
# Validation curve over the initial learning rate
plot_validation_curve(MLPRegressor(**mlpr_em_params),
                      x_train_std_em_18f,
                      y_train_emissions,
                      'learning_rate_init',
                      np.arange(0.0001, 0.001, 0.0001),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# 0.0005 seems to be a better value for learning rate
# NOTE(review): mlpr_model rebuilt here is never used — the training call
# below passes mlpr_em_model (same settings, defined earlier); kept as-is
# to preserve notebook state, but it is dead code.
mlpr_model = MLPRegressor(activation='relu', #'identity',
                          learning_rate='adaptive',
                          alpha=4,
                          max_iter=5000,
                          verbose=False)
# FIX: GridSearchCV param_grid values must be lists of candidates; a bare
# (16, 3) tuple would be searched as the two candidates 16 and 3 instead of
# one (16, 3) architecture.
mlpr_em_params = {'hidden_layer_sizes': [(16, 3)],
                  'learning_rate_init': [0.0005]}
mlpr_em_reduced_data_18f, train_and_test_data_reduced_18f = run_training_cycle(rfr_em_mif_18_labels,
                                                                               mlpr_em_params,
                                                                               mlpr_em_model,
                                                                               v=True)
# Plot learning curve
x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]
# Plain estimator kwargs for MLPRegressor(**params) below
mlpr_em_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 5000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0005}
# N.B : function from mlearn evaluator module
plot_learning_curve(MLPRegressor(**mlpr_em_params),
                    x_train_std_em_18f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
mlpr_em_reduced_data_18f['learning_potential'] = 'Yes'
%%time
# Third training cycle with features reduced
# Random forest on the energy target; grid fixed to the best
# hyper-parameters found in earlier cycles.
rfr_en_model = RandomForestRegressor(random_state=42)
rfr_en_params = {'n_estimators': [100],
                 'min_samples_leaf': [1]}
rfr_en_third_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                        rfr_en_model,
                                                                        rfr_en_params)
# Display the comparison table (notebook cell output)
rfr_en_third_cycle_results_df
%%time
# Third training cycle with features reduced
# Random forest on the emissions target
rfr_em_model = RandomForestRegressor(random_state=42)
rfr_em_params = {'n_estimators': [350],
                 'min_samples_leaf': [1]}
rfr_em_third_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                        rfr_em_model,
                                                                        rfr_em_params)
# Display the comparison table (notebook cell output)
rfr_em_third_cycle_results_df
%%time
# Third training cycle with features reduced
# XGBoost on the energy target; grid fixed to the best hyper-parameters
xgb_reg_en_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_en_params = {'n_estimators': [500],
                     'alpha': [0.5],
                     'learning_rate': [0.062],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }
xgb_reg_en_third_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                            xgb_reg_en_model,
                                                                            xgb_reg_en_params)
# Display the comparison table (notebook cell output)
xgb_reg_en_third_cycle_results_df
# Retrain on the 12 features selected by the random forest
xgb_reg_en_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_en_reduced_data_12f, train_and_test_data_reduced_12f = run_training_cycle(rfr_en_mif_12_labels,
                                                                                  xgb_reg_en_params,
                                                                                  xgb_reg_en_model,
                                                                                  v=False)
# Plot training curve
x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]
# Rebinding: plain XGBRegressor kwargs (not a grid) for the curve plots
xgb_reg_en_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 0.5,
                     'learning_rate': 0.062,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}
# Validation curve over the number of boosting rounds
plot_validation_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                      x_train_std_en_12f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                    x_train_std_en_12f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
xgb_reg_en_reduced_data_12f['learning_potential'] = 'Yes'
%%time
# Third training cycle with features reduced
# XGBoost on the emissions target; grid fixed to the best hyper-parameters
xgb_reg_em_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_em_params = {'n_estimators': [750],
                     'alpha': [1.6],
                     'learning_rate': [0.05],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }
xgb_reg_em_third_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                            xgb_reg_em_model,
                                                                            xgb_reg_em_params)
# Display the comparison table (notebook cell output)
xgb_reg_em_third_cycle_results_df
# Retrain on the 18 features selected by the random forest
xgb_reg_em_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_reg_em_reduced_data_18f, train_and_test_data_reduced_18f = run_training_cycle(rfr_em_mif_18_labels,
                                                                                  xgb_reg_em_params,
                                                                                  xgb_reg_em_model,
                                                                                  v=False)
# Plot training curve
x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]
# Rebinding: plain XGBRegressor kwargs (not a grid) for the curve plots
xgb_reg_em_params = {'objective': 'reg:squarederror',
                     'n_estimators': 750,
                     'alpha': 1.6,
                     'learning_rate': 0.05,
                     'max_depth': 20,
                     'colsample_bytree': 0.35
                     }
# Validation curve over the number of boosting rounds
plot_validation_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                      x_train_std_em_18f,
                      y_train_emissions,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
# Plot learning curve
plot_learning_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                    x_train_std_em_18f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')
# Potential for improvement : Yes
xgb_reg_em_reduced_data_18f['learning_potential'] = 'Yes'
# Best models for energy target (model data dictionaries from train_gridsearch wrapper)
best_en_models = [enet_en_reduced_data_31f,
                  rfr_en_reduced_data_12f,
                  xgb_reg_en_reduced_data_12f, #xgb_reg_en_reduced_data_44f,
                  kernel_svr_en_reduced_data_12f,
                  mlpr_en_reduced_data_12f]
# Best models for emissions target (model data dictionaries from train_gridsearch wrapper)
best_em_models = [enet_em_reduced_data_36f,
                  rfr_em_reduced_data_18f,
                  xgb_reg_em_reduced_data_18f, #xgb_reg_em_reduced_data_44f,
                  kernel_svr_em_reduced_data_18f,
                  mlpr_em_reduced_data_18f]
# Build main evaluation variables list from a random model data dictionary.
# NOTE: despite the name, this is a dict — only its KEYS are iterated below.
# Relies on dict insertion order; the [2:] slice skips the first two items
# of the model-data dict (presumably non-metric entries — TODO confirm).
model_data_keys = dict(list(rfr_en_reduced_data_12f.items())[2:])
# --> ['model_name', 'rmse', 'r2', 'time', 'n_features', 'learning_potential']
# Build main evaluation variables for result dataframes
training_results_keys = ['Model', 'RMSE', 'R2', 'Run time', 'Selected features', 'Learning potential']
# Build result dictionaries for each target
energy_training_results = {k: [] for k in training_results_keys}
emissions_training_results = {k: [] for k in training_results_keys}
# Fill result dictionaries with model data for each target:
# zip pairs each display column name (k1) with the matching model-data key
# (k2) — the two lists must stay in the same order.
for en_model, em_model in zip(best_en_models, best_em_models):
    for k1, k2 in zip(training_results_keys, model_data_keys):
        energy_training_results[k1].append(en_model[k2])
        emissions_training_results[k1].append(em_model[k2])
target_dicts = [energy_training_results, emissions_training_results]
# Build result dataframes for each target
en_results_df, em_results_df = [pd.DataFrame(target_dict) for target_dict in target_dicts]
# Display the energy results table (notebook cell output)
en_results_df
# Best model XGBRegressor
# Display the emissions results table (notebook cell output)
em_results_df
# Best model XGBRegressor
# Energy target ('SiteEnergyUse(kBtu)') results
en_results_data = {'en_results_df': en_results_df,
                   'en_best_model': xgb_reg_en_reduced_data_12f,
                   'en_best_features': rfr_en_mif_12_labels}
# Emissions target ('TotalGHGEmissions') results
em_results_data = {'em_results_df': em_results_df,
                   'em_best_model': xgb_reg_em_reduced_data_18f,
                   'em_best_features': rfr_em_mif_18_labels}
# Merge target results into another dictionary
results_data = {'en': en_results_data, 'em': em_results_data}
# Save it as a .pkl file (pickle_data comes from a project module;
# method='w' presumably means write mode — confirm against its definition)
pickle_data(filename='main_results_no_ENERGYSTARScore_no_main_energy_vars',
            folder='../data/pkl',
            data=results_data,
            method='w')
# Check whether both targets ended up with the same selected feature list
# (bare expression: notebook cell output)
rfr_en_mif_12_labels == rfr_em_mif_18_labels
# Display the energy feature list (notebook cell output)
rfr_en_mif_12_labels
# Plot feature importance from best model
xgb.plot_importance(xgb_reg_en_reduced_data_12f['model'])
# Map engineered features back to the raw variables they were built from:
# "North_south_dist" --> (built from) Latitude
# "East_west_dist" --> (built from) Longitude
# Raw input variables needed to predict the energy target
selected_features_energy_target = ['Latitude',
                                   'Longitude',
                                   'PropertyGFATotal',
                                   'BuildingType',
                                   'Neighborhood',
                                   'PrimaryPropertyType',
                                   'SecondLargestPropertyUseTypeGFA']
print(f"Total of variables required : {len(selected_features_energy_target)}")
# Display the emissions feature list (notebook cell output)
rfr_em_mif_18_labels
# Plot feature importance from best model
xgb.plot_importance(xgb_reg_em_reduced_data_18f['model'])
# "North_south_dist" --> (built from) Latitude
# "East_west_dist" --> (built from) Longitude
# "OldBuilding" --> (built from) YearBuilt
# Raw input variables needed to predict the emissions target
selected_features_emissions_target = ['Latitude',
                                      'Longitude',
                                      'YearBuilt',
                                      'PropertyGFATotal',
                                      'BuildingType',
                                      'Neighborhood',
                                      'PrimaryPropertyType',
                                      'SecondLargestPropertyUseTypeGFA']
print(f"Total of variables required : {len(selected_features_emissions_target)}")